/*
 * pulp_morph.c
 * Tommaso Polonelli - tommaso.polonelli2@unibo.it
 *
 * Copyright (C) 2020 University of Bologna, Greenwaves Technologies
 *
 * This software may be modified and distributed under the terms
 * of the MIT license.  See the LICENSE file for details.
 *
 * Created on: January 24, 2021
 *
 */
#include "pmsis.h"
#include "pulp_morph.h"


/*********** Constants  ****************/

/* Operation block size (number of pixels per operator):
 * this impacts L1 memory usage but decreases the number of L1 -> L3 transfers.
 */

/* PERF: when non-zero, pulpCV_fc_morph() and pulpCV_cl_morph() are built
 * with pi_perf_* cycle counting and print the measured cycles with printf.
 */
#define PERF		(0)

/*
 * Copy `size` bytes from `src` to `dst` through the dmacpy device
 * (PI_DMACPY_L2_L2 direction).
 *
 * src        : source buffer
 * dst        : destination buffer
 * size       : number of bytes to copy; values <= 0 copy nothing
 * dma_device : caller-provided device structure, opened and closed here
 *
 * Only the largest multiple of 4 bytes is handed to the DMA; the remaining
 * 1..3 tail bytes are copied by the core, so the whole buffer is
 * transferred even when `size` is not a multiple of 4.
 * NOTE(review): src/dst are presumably expected to be 4-byte aligned for
 * the DMA part - confirm against the dmacpy driver documentation.
 *
 * If the device cannot be opened the program is stopped with pmsis_exit(-1).
 */
inline static void __attribute__((always_inline)) L2L2_cpy
(uint8_t *src, uint8_t *dst, int size, struct pi_device *dma_device)
{

	if (size <= 0){
		/* nothing to copy; also avoids wrapping a negative size to a huge uint32_t */
		return;
	}

	uint32_t total = (uint32_t) size;
	uint32_t dmasize = total - (total%4);

	if (dmasize != 0){
		/* Init & open dmacpy. */
		struct pi_dmacpy_conf dmacpy_conf = {0};
		pi_dmacpy_conf_init(&dmacpy_conf);
		pi_open_from_conf(dma_device, &dmacpy_conf);
		if(pi_dmacpy_open(dma_device)){
			printf("DMA Error");
			pmsis_exit(-1);
		}
		pi_dmacpy_copy	(dma_device, src, dst, dmasize, PI_DMACPY_L2_L2);
		pi_dmacpy_close(dma_device);
	}

	/* tail bytes the DMA did not move */
	for (uint32_t i = dmasize; i < total; i++){
		dst[i] = src[i];
	}

}


/*
 * Blocking cluster DMA transfer, direction PI_CL_DMA_DIR_EXT2LOC:
 * moves `sz` bytes from `l2_in` (external side) into `l1_buffer`
 * (local side) and returns once the transfer has been waited for.
 *
 * Note: `sz` is narrowed to 16 bits before being stored in the descriptor.
 */
inline static void __attribute__((always_inline)) cl_in(uint8_t *l2_in, uint8_t *l1_buffer, int sz){

	pi_cl_dma_copy_t xfer;

	/* endpoints */
	xfer.ext = (uint32_t) l2_in;
	xfer.loc = (uint32_t) l1_buffer;

	/* transfer description */
	xfer.size = (uint16_t) sz;
	xfer.dir = PI_CL_DMA_DIR_EXT2LOC;
	xfer.merge = 0;
	xfer.id = 0;

	/* start the copy, then block until it completes */
	pi_cl_dma_memcpy(&xfer);
	pi_cl_dma_wait(&xfer);

}

/*
 * Blocking cluster DMA transfer, direction PI_CL_DMA_DIR_LOC2EXT:
 * moves `sz` bytes from `l1_buffer` (local side) to `l2_out`
 * (external side) and returns once the transfer has been waited for.
 *
 * Note: `sz` is narrowed to 16 bits before being stored in the descriptor.
 */
inline static void __attribute__((always_inline)) cl_out(uint8_t *l2_out, uint8_t *l1_buffer, int sz){

	pi_cl_dma_copy_t xfer;

	/* endpoints */
	xfer.ext = (uint32_t)l2_out;
	xfer.loc = (uint32_t)l1_buffer;

	/* transfer description */
	xfer.size = (uint16_t) sz;
	xfer.dir = PI_CL_DMA_DIR_LOC2EXT;
	xfer.merge = 0;
	xfer.id = 0;

	/* start the copy, then block until it completes */
	pi_cl_dma_memcpy(&xfer);
	pi_cl_dma_wait(&xfer);

}

/*
 * Per-core share of X items when split over the current team:
 * X shifted right by __builtin_pulp_fl1(team size), plus one when X has
 * low bits set under the mask (team size - 1).
 *
 * NOTE(review): the shift/mask arithmetic presumably relies on the team
 * size being a power of two - confirm.
 */
inline static unsigned int __attribute__((always_inline)) ChunkSize(unsigned int X)
{
	unsigned int team_size = pi_cl_team_nb_cores();
	unsigned int shift = __builtin_pulp_fl1(team_size);
	unsigned int per_core = X >> shift;

	/* round up when X is not an exact multiple of the team size */
	if ((X & (team_size - 1)) != 0) {
		per_core += 1;
	}

	return per_core;
}


/*
 * Copy `Chunks` consecutive chunks of `Chunksize` bytes from `src` to
 * `dst`, staging each chunk through a temporary buffer obtained from
 * pmsis_l1_malloc() (cl_in followed by cl_out).
 *
 * src       : source buffer, at least Chunksize*Chunks bytes
 * dst       : destination buffer, at least Chunksize*Chunks bytes
 * Chunksize : bytes per transfer; also the size of the staging buffer
 * Chunks    : number of transfers
 *
 * Does nothing when either count is not positive. If the staging buffer
 * cannot be allocated the program is stopped with pmsis_exit(-1), in line
 * with the other allocation checks in this file.
 */
inline static void __attribute__((always_inline)) cl_L2L2_cpy
		(uint8_t *src, uint8_t *dst, int Chunksize, int Chunks)
		{

	if ((Chunksize <= 0) || (Chunks <= 0)){
		return;
	}

	/* copy L2 - L1 - L2 */
	uint8_t *tmp = (uint8_t *) pmsis_l1_malloc((uint32_t) Chunksize);
	if (tmp == NULL){printf("buff alloc failed !\r\n");pmsis_exit(-1);}

	for(int j=0;j<Chunks;j++){
		cl_in(&src[j*Chunksize],tmp,Chunksize);
		cl_out(&dst[j*Chunksize], tmp, Chunksize);
	}

	pmsis_l1_malloc_free(tmp, (uint32_t) Chunksize);

		}


/*
 * Run the morphological operation selected by pulpMorpop->mode using the
 * Erosion()/Dilation() routines.
 *
 * pulpMorpop : operator descriptor; the fields read here are input_frame,
 *              output_frame, W_IMG, H_IMG, mode and K (structuring element).
 *
 * Behaviour per mode:
 *  - MORPH_EROSION / MORPH_DILATION: one pass input -> output. When input
 *    and output are the same buffer, the pass writes to a temporary frame
 *    that is then copied to the output with L2L2_cpy().
 *  - MORPH_OPENING (erosion then dilation) / MORPH_CLOSING (dilation then
 *    erosion): when input and output are the same buffer, a temporary frame
 *    holds the intermediate result. Otherwise the first pass writes to
 *    output_frame, the second pass writes back to input_frame, and
 *    input_frame is finally copied to output_frame - so input_frame is
 *    OVERWRITTEN with the result in that case.
 *  - any other mode: nothing is processed.
 *
 * The structuring element is copied locally (K.kernel is duplicated with
 * pi_fc_l1_malloc) for the duration of the call. All temporary allocations
 * are released on every path, including the unsupported-mode path.
 * An allocation failure stops the program with pmsis_exit(-1).
 */
void pulpCV_fc_morph(
		pulp_morph_Operator * pulpMorpop
){

#if PERF
	uint32_t tt;
	int ImgSize = pulpMorpop->W_IMG * pulpMorpop->H_IMG;
	pi_perf_conf(1 << PI_PERF_CYCLES | 1 << PI_PERF_ACTIVE_CYCLES);
	pi_perf_start();
	tt = pi_perf_read(PI_PERF_ACTIVE_CYCLES);
	printf("---------------\n");
#endif

	uint8_t	* input_frame = pulpMorpop->input_frame;
	uint8_t	* output_frame = pulpMorpop->output_frame;
	uint8_t	* out = NULL;		/* temporary frame, only used when in_place */
	uint8_t	* mid, * final;		/* two-pass modes: intermediate and last destination */
	int W_IMG 	= pulpMorpop->W_IMG;
	int H_IMG	= pulpMorpop->H_IMG;
	e_morph_mod_e mode = pulpMorpop->mode;
	int in_place = (input_frame == output_frame);

	struct pi_device *dma_device;
	dma_device = (struct pi_device *) pi_l2_malloc((uint32_t) sizeof(struct pi_device));
	if (dma_device == NULL){printf("buff alloc failed !\r\n");pmsis_exit(-1);}

	/* import kernel */
	m_kernel_t K;
	memcpy((void *)&K,(void *)pulpMorpop->K,sizeof(m_kernel_t));

	K.kernel = (uint8_t *) pi_fc_l1_malloc((uint32_t) K.Rows*K.Cols);
	if (K.kernel == NULL){printf("buff alloc failed !\r\n");pmsis_exit(-1);}
	memcpy((void *)K.kernel,(void *)pulpMorpop->K->kernel,(uint32_t) K.Rows*K.Cols);

	switch(mode){

	case MORPH_EROSION:
	case MORPH_DILATION:

		if (in_place){
			/* intermediate buffer needed */
			out = (uint8_t *) pi_l2_malloc((uint32_t) W_IMG*H_IMG);
			if (out == NULL){printf("buff alloc failed !\r\n");pmsis_exit(-1);}
		}else{
			out = output_frame;
		}

		if (mode == MORPH_EROSION){
			Erosion
			(input_frame, out, H_IMG, W_IMG, dma_device, K);
		}else{
			Dilation
			(input_frame, out, H_IMG, W_IMG, dma_device, K);
		}

		if (in_place){
			L2L2_cpy(out, output_frame, W_IMG*H_IMG, dma_device);
			pmsis_l2_malloc_free(out, (uint32_t) W_IMG*H_IMG);
		}

		break;

	case MORPH_OPENING:
	case MORPH_CLOSING:

		if (in_place){
			/* input -> temporary -> output */
			out = (uint8_t *) pmsis_l2_malloc((uint32_t) W_IMG*H_IMG);
			if (out == NULL){printf("buff alloc failed !\r\n");pmsis_exit(-1);}
			mid = out;
			final = output_frame;
		}else{
			/* input -> output -> input, result copied to output below */
			mid = output_frame;
			final = input_frame;
		}

		if (mode == MORPH_OPENING){
			Erosion
			(input_frame, mid, H_IMG, W_IMG, dma_device, K);
			Dilation
			(mid, final, H_IMG, W_IMG, dma_device, K);
		}else{
			Dilation
			(input_frame, mid, H_IMG, W_IMG, dma_device, K);
			Erosion
			(mid, final, H_IMG, W_IMG, dma_device, K);
		}

		if (in_place){
			pmsis_l2_malloc_free(out, (uint32_t) W_IMG*H_IMG);
		}else{
			L2L2_cpy(input_frame, output_frame, W_IMG*H_IMG, dma_device);
		}

		break;

	default:
		/* unsupported mode: no processing, but still release the buffers below */
		break;

	}

	pi_fc_l1_free(K.kernel, (uint32_t) K.Rows*K.Cols);
	pmsis_l2_malloc_free(dma_device, (uint32_t) sizeof(struct pi_device));

#if PERF
	pi_perf_stop();
	printf("Morph Cycles %d \n",(int)(pi_perf_read(PI_PERF_ACTIVE_CYCLES)-tt));
	printf("Morph cpp %d \n",((int)(pi_perf_read(PI_PERF_ACTIVE_CYCLES)-tt))/((int)(ImgSize)));
#endif

}


/*
 * Erosion worker, forked on every core of the cluster team by
 * pulpCV_cl_morph().
 *
 * pulpMorpop : operator descriptor; reads input_frame, output_frame, W_IMG,
 *              H_IMG, K and Kl1 (shared work area used by the whole team).
 *
 * Core 0 owns all shared state: it copies the structuring element into
 * op->K, allocates the row buffers and the kernel copy with
 * pmsis_l1_malloc(), and performs every DMA transfer. The other cores only
 * read op->... after the first barrier, so they never write op->K (this
 * avoids a late core overwriting the kernel pointer core 0 has just set).
 *
 * Per output row:
 *  - rows for which the whole kernel window lies inside the image:
 *    core 0 loads K.Rows input rows, each core runs Kernel_Col_Erosion on
 *    its column range [First, Last), core 0 stores the output row;
 *  - other (border) rows: the output row is filled with zeros.
 *
 * An allocation failure stops the program with pmsis_exit(-1).
 */
void __pulpCV_cl_morph(
		pulp_morph_Operator * pulpMorpop
){

	uint32_t CoreId = pi_core_id();

	pulp_morph_Operator_cl *op = pulpMorpop->Kl1;
	int W_IMG 	= pulpMorpop->W_IMG;
	int H_IMG	= pulpMorpop->H_IMG;
	uint8_t	* input_frame = pulpMorpop->input_frame;
	uint8_t	* output_frame = pulpMorpop->output_frame;
	/* read from the caller's kernel: op->K is only valid after the barrier */
	int insize = pulpMorpop->K->Rows * W_IMG;

	/* only core 0 */
	if (CoreId == 0){
		/* import kernel */
		op->K = *pulpMorpop->K;
		uint8_t *tmp = op->K.kernel;

		op->in = (uint8_t *) pmsis_l1_malloc((uint32_t) insize);
		op->out = (uint8_t *) pmsis_l1_malloc((uint32_t) W_IMG);
		op->K.kernel = (uint8_t *) pmsis_l1_malloc((uint32_t) op->K.Rows*op->K.Cols);
		if ((op->in == NULL) || (op->out == NULL) || (op->K.kernel == NULL)){
			printf("buff alloc failed !\r\n");pmsis_exit(-1);
		}
		cl_in(tmp, op->K.kernel, op->K.Rows*op->K.Cols);
	}

	/* column range handled by this core */
	int BLOCK_SIZE = W_IMG;
	int chunk = ChunkSize(BLOCK_SIZE);
	int First  = CoreId*chunk;
	int Last   = (First+chunk > BLOCK_SIZE) ? (BLOCK_SIZE) : (First+chunk);
	/* First can exceed Last on the highest cores when the row is narrow:
	 * never let the memset length go negative */
	int ParBlocksize = (Last > First) ? (Last - First) : 0;

	pi_cl_team_barrier();

	/*Erosion*/
	int RR = op->K.Rows - op->K.rowOrigin - 1;
	for (int currentRow = 0; currentRow < H_IMG ; currentRow++) {
		if ((currentRow >= op->K.rowOrigin) && (currentRow < (H_IMG - RR))){

			if (CoreId == 0){
				cl_in(&input_frame[(currentRow - op->K.rowOrigin)*W_IMG],
						op->in,
						insize);
			}

			/* input rows are in place for everybody */
			pi_cl_team_barrier();

			Kernel_Col_Erosion(
					op->in + (op->K.rowOrigin*W_IMG),
					op->in,
					op->out,
					First,
					Last,
					W_IMG,
					&op->K);

		}else{
			if (ParBlocksize > 0){
				memset(&op->out[First],0,ParBlocksize);
			}
		}

		/* every core has finished its part of op->out */
		pi_cl_team_barrier();
		if (CoreId == 0){
			cl_out(&output_frame[currentRow*W_IMG], op->out, W_IMG);
		}
		/* op->out may be reused from here on */
		pi_cl_team_barrier();

	}

	if (CoreId == 0){
		pmsis_l1_malloc_free(op->in, (uint32_t) insize);
		pmsis_l1_malloc_free(op->out, (uint32_t) W_IMG);
		pmsis_l1_malloc_free(op->K.kernel, (uint32_t) op->K.Rows*op->K.Cols);
	}

	pi_cl_team_barrier();

}

/*
 * Dilation worker, forked on every core of the cluster team by
 * pulpCV_cl_morph().
 *
 * pulpMorpop : operator descriptor; reads input_frame, output_frame, W_IMG,
 *              H_IMG, K and Kl1 (shared work area used by the whole team).
 *
 * Core 0 owns all shared state: it copies the structuring element into
 * op->K, allocates the two K.Rows-row buffers and the kernel copy with
 * pmsis_l1_malloc(), and performs every DMA transfer. The other cores only
 * read op->... after the first barrier, so they never write op->K (this
 * avoids a late core overwriting the kernel pointer core 0 has just set).
 *
 * Per row:
 *  - rows for which the whole kernel window lies inside the image:
 *    core 0 loads the same K.Rows-row window from input_frame (op->in) and
 *    from output_frame (op->out), each core runs Kernel_Col_Dilation on its
 *    column range [First, Last), then core 0 writes the whole window back
 *    to output_frame;
 *  - other (border) rows: the input row is copied unchanged to the output.
 *
 * NOTE(review): the window loaded from output_frame contains rows that no
 * earlier iteration has written, and the trailing border rows overwrite
 * rows already written back by earlier windows. Whether that is intended
 * depends on Kernel_Col_Dilation, which is not visible here - confirm.
 *
 * An allocation failure stops the program with pmsis_exit(-1).
 */
void __pulpCV_cl_morph_dilation(
		pulp_morph_Operator * pulpMorpop
){

	uint32_t CoreId = pi_core_id();

	pulp_morph_Operator_cl *op = pulpMorpop->Kl1;
	int W_IMG 	= pulpMorpop->W_IMG;
	int H_IMG	= pulpMorpop->H_IMG;
	uint8_t	* input_frame = pulpMorpop->input_frame;
	uint8_t	* output_frame = pulpMorpop->output_frame;
	/* read from the caller's kernel: op->K is only valid after the barrier */
	int insize = pulpMorpop->K->Rows * W_IMG;

	/* only core 0 */
	if (CoreId == 0){
		/* import kernel */
		op->K = *pulpMorpop->K;
		uint8_t *tmp = op->K.kernel;

		op->in = (uint8_t *) pmsis_l1_malloc((uint32_t) insize);
		op->out = (uint8_t *) pmsis_l1_malloc((uint32_t) insize);
		op->K.kernel = (uint8_t *) pmsis_l1_malloc((uint32_t) op->K.Rows*op->K.Cols);
		if ((op->in == NULL) || (op->out == NULL) || (op->K.kernel == NULL)){
			printf("buff alloc failed !\r\n");pmsis_exit(-1);
		}
		cl_in(tmp, op->K.kernel, op->K.Rows*op->K.Cols);
	}

	/* column range handled by this core */
	int BLOCK_SIZE = W_IMG;
	int chunk = ChunkSize(BLOCK_SIZE);
	int First  = CoreId*chunk;
	int Last   = (First+chunk > BLOCK_SIZE) ? (BLOCK_SIZE) : (First+chunk);

	pi_cl_team_barrier();

	/*Dilation*/
	int RR = op->K.Rows - op->K.rowOrigin - 1;
	for (int currentRow = 0; currentRow < H_IMG ; currentRow++) {
		if ((currentRow >= op->K.rowOrigin) && (currentRow < (H_IMG - RR))){

			if (CoreId == 0){
				cl_in(&input_frame[(currentRow - op->K.rowOrigin)*W_IMG],
						op->in,
						insize);
				/* load output */
				cl_in(&output_frame[(currentRow - op->K.rowOrigin)*W_IMG],
						op->out,
						insize);
			}

			/* both windows are in place for everybody */
			pi_cl_team_barrier();

			Kernel_Col_Dilation(
					op->in + (op->K.rowOrigin*W_IMG),
					op->in,
					op->out,
					First,
					Last,
					W_IMG,
					&op->K);

			/* every core has finished its part of op->out */
			pi_cl_team_barrier();

			if (CoreId == 0){
				cl_out(&output_frame[(currentRow - op->K.rowOrigin)*W_IMG],
						op->out,
						insize);
			}

		}else{

			if (CoreId == 0){
				// load one line
				cl_in(&input_frame[(currentRow)*W_IMG],
						op->in,
						W_IMG);
				/* store line */
				cl_out(&output_frame[(currentRow)*W_IMG],
						op->in,
						W_IMG);
			}

			pi_cl_team_barrier();

		}

	}

	if (CoreId == 0){
		pmsis_l1_malloc_free(op->in, (uint32_t) insize);
		pmsis_l1_malloc_free(op->out, (uint32_t) insize);
		pmsis_l1_malloc_free(op->K.kernel, (uint32_t) op->K.Rows*op->K.Cols);
	}

	pi_cl_team_barrier();

}


void pulpCV_cl_morph(
		pulp_morph_Operator * pulpMorpop
){

#if PERF
	uint32_t tt;
	int ImgSize = pulpMorpop->W_IMG * pulpMorpop->H_IMG;
	pi_perf_conf(1 << PI_PERF_CYCLES | 1 << PI_PERF_ACTIVE_CYCLES);
	printf("---------------\n");
	printf("Core controller: %d \n",pi_core_id());
	pi_perf_start();
	tt = pi_perf_read(PI_PERF_ACTIVE_CYCLES);
#endif

	/* import kernel */
	int W_IMG 	= pulpMorpop->W_IMG;
	int H_IMG	= pulpMorpop->H_IMG;
	e_morph_mod_e mode = pulpMorpop->mode;
	uint8_t	* input_frame = pulpMorpop->input_frame;
	uint8_t	* output_frame = pulpMorpop->output_frame;
	uint8_t	* out, * input;
	int NCores = pi_cl_cluster_nb_cores();

	/* L1 kernel */
	pulpMorpop->Kl1 = (pulp_morph_Operator_cl *) pmsis_l1_malloc((uint32_t) sizeof(pulp_morph_Operator_cl));
	if (pulpMorpop->Kl1 == NULL){printf("buff alloc failed !\n");pmsis_exit(-1);}

	switch(mode){

	case MORPH_EROSION:

		if (input_frame == output_frame){
			/* intermediate buffer needed */
			out = (uint8_t *) pi_l2_malloc((uint32_t) W_IMG*H_IMG);
			if (out == NULL){printf("buff alloc failed !\r\n");pmsis_exit(-1);}
		}else{
			out = output_frame;
		}
		input = input_frame;

		pulpMorpop->input_frame = input;
		pulpMorpop->output_frame = out;

		pi_cl_team_fork(
				NCores,
				(void *)__pulpCV_cl_morph, (void *) pulpMorpop);

		if (input_frame == output_frame){
			/* copy L2 - L2 */
			cl_L2L2_cpy(out, output_frame, W_IMG, H_IMG);
			pmsis_l2_malloc_free(out, (uint32_t) W_IMG*H_IMG);
		}

		pulpMorpop->input_frame = input_frame;
		pulpMorpop->output_frame = output_frame;

		break;

	case MORPH_DILATION:

		if (input_frame == output_frame){
			/* intermediate buffer needed */
			out = (uint8_t *) pi_l2_malloc((uint32_t) W_IMG*H_IMG);
			if (out == NULL){printf("buff alloc failed !\r\n");pmsis_exit(-1);}
		}else{
			out = output_frame;
		}
		input = input_frame;

		pulpMorpop->input_frame = input;
		pulpMorpop->output_frame = out;

		pi_cl_team_fork(
				NCores,
				(void *)__pulpCV_cl_morph_dilation, (void *) pulpMorpop);

		if (input_frame == output_frame){
			/* copy L2 - L2 */
			cl_L2L2_cpy(out, output_frame, W_IMG, H_IMG);
			pmsis_l2_malloc_free(out, (uint32_t) W_IMG*H_IMG);
		}

		pulpMorpop->input_frame = input_frame;
		pulpMorpop->output_frame = output_frame;

		break;

	case MORPH_OPENING:

		if (input_frame == output_frame){

			out = (uint8_t *) pmsis_l2_malloc((uint32_t) W_IMG*H_IMG);
			if (out == NULL){printf("buff alloc failed !\r\n");pmsis_exit(-1);}

			/*Erosion*/
			//(input_frame, out, H_IMG, W_IMG, dma_device, K);
			pulpMorpop->output_frame = out;
			pi_cl_team_fork(
					NCores,
					(void *)__pulpCV_cl_morph, (void *) pulpMorpop);
			/*Dilation*/
			//(out, output_frame, H_IMG, W_IMG, dma_device, K);
			pulpMorpop->input_frame = out;
			pulpMorpop->output_frame = output_frame;
			pi_cl_team_fork(
					NCores,
					(void *)__pulpCV_cl_morph_dilation, (void *) pulpMorpop);

			pmsis_l2_malloc_free(out, (uint32_t) W_IMG*H_IMG);

			pulpMorpop->input_frame = input_frame;
			pulpMorpop->output_frame = output_frame;

		}else{

			/* Erosion */
			//(input_frame, output_frame, H_IMG, W_IMG, dma_device, K);
			pi_cl_team_fork(
					NCores,
					(void *)__pulpCV_cl_morph, (void *) pulpMorpop);
			/* Dilation */
			//(output_frame, input_frame, H_IMG, W_IMG, dma_device, K);
			pulpMorpop->input_frame = output_frame;
			pulpMorpop->output_frame = input_frame;
			pi_cl_team_fork(
					NCores,
					(void *)__pulpCV_cl_morph_dilation, (void *) pulpMorpop);

			/* copy L2 - L2 */
			cl_L2L2_cpy(input_frame, output_frame, W_IMG, H_IMG);

			pulpMorpop->input_frame = input_frame;
			pulpMorpop->output_frame = output_frame;

		}

		break;

	case MORPH_CLOSING:

		if (input_frame == output_frame){

			out = (uint8_t *) pmsis_l2_malloc((uint32_t) W_IMG*H_IMG);
			if (out == NULL){printf("buff alloc failed !\r\n");pmsis_exit(-1);}

			/*Erosion*/
			//(input_frame, out, H_IMG, W_IMG, dma_device, K);
			pulpMorpop->output_frame = out;
			pi_cl_team_fork(
					NCores,
					(void *)__pulpCV_cl_morph_dilation, (void *) pulpMorpop);
			/*Dilation*/
			//(out, output_frame, H_IMG, W_IMG, dma_device, K);
			pulpMorpop->input_frame = out;
			pulpMorpop->output_frame = output_frame;
			pi_cl_team_fork(
					NCores,
					(void *)__pulpCV_cl_morph, (void *) pulpMorpop);

			pmsis_l2_malloc_free(out, (uint32_t) W_IMG*H_IMG);

			pulpMorpop->input_frame = input_frame;
			pulpMorpop->output_frame = output_frame;

		}else{

			/* Erosion */
			//(input_frame, output_frame, H_IMG, W_IMG, dma_device, K);
			pi_cl_team_fork(
					NCores,
					(void *)__pulpCV_cl_morph_dilation, (void *) pulpMorpop);
			/* Dilation */
			//(output_frame, input_frame, H_IMG, W_IMG, dma_device, K);
			pulpMorpop->input_frame = output_frame;
			pulpMorpop->output_frame = input_frame;
			pi_cl_team_fork(
					NCores,
					(void *)__pulpCV_cl_morph, (void *) pulpMorpop);

			/* copy L2 - L2 */
			cl_L2L2_cpy(input_frame, output_frame, W_IMG, H_IMG);

			pulpMorpop->input_frame = input_frame;
			pulpMorpop->output_frame = output_frame;

		}

		break;

	default:
		return;
		break;

	}

	pmsis_l1_malloc_free(pulpMorpop->Kl1, (uint32_t) sizeof(pulp_morph_Operator_cl));

#if PERF
	pi_perf_stop();
	printf("Morph Cycles %d \n",(int)(pi_perf_read(PI_PERF_ACTIVE_CYCLES)-tt));
	printf("Morph cpp %d \n",((int)(pi_perf_read(PI_PERF_ACTIVE_CYCLES)-tt))/((int)(ImgSize)));
#endif

}


